/*
# Copyright (c) 2022 Shenzhen Kaihong Digital Industry Development Co., Ltd.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
*/

#include "lite/api/tools/benchmark_light.h"

#include <algorithm>
#include <cstdio>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <memory>
#include <numeric>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "lite/core/version.h"
#include "lite/utils/timer.h"
using namespace paddle::lite_api;
using namespace paddle;
// Entry point. Valid invocations:
//   <model>                          — benchmark with all defaults below, or
//   <model> n c h w repeats warmup threads power_mode — fully specified.
// Anything in between prints the usage text and exits with failure.
int main(int argc, char* argv[]) {
  // Defaults: NCHW input shape and runtime settings, used when only the
  // model path is supplied.
  shape_t input_shape{1, 3, 224, 224};
  int repeats = 10;
  int warmup = 10;
  int threads = 1;
  int power_mode = 1;
  // The original guard (argc > 2 && argc < 10) let argc == 1 fall through,
  // after which argv[1] was read out of bounds. Require the model path.
  if (argc < 2 || (argc > 2 && argc < 10)) {
    std::cerr << "usage: ./" << argv[0] << "\n"
              << "  <naive_buffer_model_dir>\n"
              << "  <input_n>\n"
              << "  <input_c>\n"
              << "  <input_h>\n"
              << "  <input_w>\n"
              << "  <repeats>\n"
              << "  <warmup>\n"
              << "  <threads>\n"
              << "  <power_mode>\n"
              << std::endl;
    return 1;  // usage error: exit non-zero so scripts can detect it
  }
  std::string opt_model_file = argv[1];
  // argv[9] (power_mode) is only valid when argc >= 10; the old `argc >= 9`
  // threshold relied on the usage guard above to avoid reading past argv.
  if (argc >= 10) {
    input_shape[0] = atoi(argv[2]);
    input_shape[1] = atoi(argv[3]);
    input_shape[2] = atoi(argv[4]);
    input_shape[3] = atoi(argv[5]);
    repeats = atoi(argv[6]);
    warmup = atoi(argv[7]);
    threads = atoi(argv[8]);
    power_mode = atoi(argv[9]);
  }
  Run(opt_model_file, input_shape, repeats, warmup, threads, power_mode);
  return 0;
}
void RunImpl(std::shared_ptr<PaddlePredictor> predictor, PerfData* perf_data) {
  lite::Timer timer;
  timer.Start();
  predictor->Run();
  perf_data->set_run_time(timer.Stop());
}

// Benchmark one optimized (naive-buffer) model:
//  - build a MobileConfig predictor with |threads| / |power_mode|,
//  - feed an all-ones float tensor of |input_shape| (NCHW),
//  - run |warmup| then |repeats| timed inferences,
//  - print per-output statistics and timing summary to stdout.
void Run(const std::string& model_file,
         const shape_t& input_shape,
         const int repeats,
         const int warmup,
         const int threads,
         const int power_mode) {
  MobileConfig config;
  config.set_model_from_file(model_file);
  config.set_threads(threads);
  config.set_power_mode(static_cast<PowerMode>(power_mode));

  lite::Timer timer;
  PerfData perf_data;
  perf_data.init(repeats);

  // Create predictor; its construction time is reported as "init".
  timer.Start();
  auto predictor = CreatePaddlePredictor(config);
  perf_data.set_init_time(timer.Stop());

  // Set input: resize to the requested NCHW shape and fill with ones.
  // GetInput(0) already returns a prvalue unique_ptr — no std::move needed.
  std::unique_ptr<Tensor> input_tensor = predictor->GetInput(0);
  input_tensor->Resize(
      {input_shape[0], input_shape[1], input_shape[2], input_shape[3]});
  auto* data = input_tensor->mutable_data<float>();
  // Hoist the element count out of the loop; use int64_t to match the
  // ShapeProduction return type and avoid a signed/width mismatch.
  const int64_t input_num = ShapeProduction(input_tensor->shape());
  for (int64_t i = 0; i < input_num; ++i) {
    data[i] = 1;
  }

  // Warmup runs (results still go through RunImpl into perf_data).
  for (int i = 0; i < warmup; ++i) {
    RunImpl(predictor, &perf_data);
    timer.SleepInMs(0.000001);
  }

  // Timed runs.
  for (int i = 0; i < repeats; ++i) {
    RunImpl(predictor, &perf_data);
    timer.SleepInMs(0.000001);
  }

  // Collect per-output statistics (shape, mean, stddev, every element).
  size_t output_tensor_num = predictor->GetOutputNames().size();
  std::stringstream out_ss;
  out_ss << "output tensor num: " << output_tensor_num;

  for (size_t tidx = 0; tidx < output_tensor_num; ++tidx) {
    std::unique_ptr<const Tensor> output_tensor = predictor->GetOutput(tidx);
    out_ss << "\n--- output tensor " << tidx << " ---\n";
    auto out_shape = output_tensor->shape();
    auto out_data = output_tensor->data<float>();
    auto ele_num = lite::ShapeProduction(out_shape);
    auto out_mean = lite::compute_mean<float>(out_data, ele_num);
    auto out_std_dev = lite::compute_standard_deviation<float>(
        out_data, ele_num, true, out_mean);

    out_ss << "output shape(NCHW): " << lite::ShapePrint(out_shape)
           << std::endl;
    out_ss << "output tensor " << tidx << " elem num: " << ele_num << std::endl;
    out_ss << "output tensor " << tidx << " mean value: " << out_mean
           << std::endl;
    out_ss << "output tensor " << tidx << " standard deviation: " << out_std_dev
           << std::endl;

    // Dump every element; reuse the cached out_data pointer instead of
    // re-calling data<float>() each iteration, and match ele_num's width.
    for (int64_t i = 0; i < ele_num; ++i) {
      out_ss << "out[" << tidx << "][" << i << "]:" << out_data[i]
             << std::endl;
    }
  }

  // Print benchmark info.
  std::stringstream ss;
  ss.precision(3);
  ss << "\n======= Model Info =======\n";
  ss << "optimized_model_file: " << model_file << std::endl;
  ss << "\n======= Runtime Info =======\n";
  ss << "threads: " << threads << std::endl;
  ss << "warmup: " << warmup << std::endl;
  ss << "repeats: " << repeats << std::endl;
  // NOTE(review): this printed delay does not match the actual
  // SleepInMs(0.000001) argument used between runs — confirm the intended
  // inter-run delay and make the two agree.
  ss << "run_delay(sec): " << 0.00001 << std::endl;

  ss << "\n======= Perf Info =======\n";
  ss << std::fixed << std::left;
  ss << "Time(unit: ms):\n";
  ss << "init  = " << std::setw(12) << perf_data.init_time() << std::endl;
  ss << "first = " << std::setw(12) << perf_data.first_time() << std::endl;
  ss << "min   = " << std::setw(12) << perf_data.min_run_time() << std::endl;
  ss << "max   = " << std::setw(12) << perf_data.max_run_time() << std::endl;
  ss << "avg   = " << std::setw(12) << perf_data.avg_run_time() << std::endl;
  // Fix: out_ss was previously built and then discarded; emit the output
  // statistics before the timing summary so they actually reach the user.
  std::cout << out_ss.str() << std::endl;
  std::cout << ss.str() << std::endl;
}
